Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960

RKSimon · 2025-08-04T13:28:58Z

Reverts #128938 while a crash regression is investigated

…ts (#128…" This reverts commit 1feed44.

llvmbot · 2025-08-04T13:29:41Z

@llvm/pr-subscribers-clang

Author: Simon Pilgrim (RKSimon)

Changes

Reverts llvm/llvm-project#128938 while a crash regression is investigated

Patch is 38.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151960.diff

7 Files Affected:

(modified) clang/test/CodeGenOpenCL/preserve_vec3.cl (+11-11)
(modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (-128)
(modified) llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll (+4-4)
(modified) llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll (+4-2)
(modified) llvm/test/Transforms/VectorCombine/X86/load-widening.ll (+2-2)
(modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll (+15-9)
(removed) llvm/test/Transforms/VectorCombine/load-shufflevector.ll (-392)

diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: define dso_local spir_kernel void @foo(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
-#include <optional>
 #include <queue>
 #include <set>
-#include <tuple>
 
 #define DEBUG_TYPE "vector-combine"
 #include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
-  bool shrinkLoadForShuffles(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
-  auto *OldLoad = dyn_cast<LoadInst>(&I);
-  if (!OldLoad || !OldLoad->isSimple())
-    return false;
-
-  auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
-  if (!OldLoadTy)
-    return false;
-
-  unsigned const OldNumElements = OldLoadTy->getNumElements();
-
-  // Search all uses of load. If all uses are shufflevector instructions, and
-  // the second operands are all poison values, find the minimum and maximum
-  // indices of the vector elements referenced by all shuffle masks.
-  // Otherwise return `std::nullopt`.
-  using IndexRange = std::pair<int, int>;
-  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
-    IndexRange OutputRange = IndexRange(OldNumElements, -1);
-    for (llvm::Use &Use : I.uses()) {
-      // Ensure all uses match the required pattern.
-      User *Shuffle = Use.getUser();
-      ArrayRef<int> Mask;
-
-      if (!match(Shuffle,
-                 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
-        return std::nullopt;
-
-      // Ignore shufflevector instructions that have no uses.
-      if (Shuffle->use_empty())
-        continue;
-
-      // Find the min and max indices used by the shufflevector instruction.
-      for (int Index : Mask) {
-        if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
-          OutputRange.first = std::min(Index, OutputRange.first);
-          OutputRange.second = std::max(Index, OutputRange.second);
-        }
-      }
-    }
-
-    if (OutputRange.second < OutputRange.first)
-      return std::nullopt;
-
-    return OutputRange;
-  };
-
-  // Get the range of vector elements used by shufflevector instructions.
-  if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
-    unsigned const NewNumElements = Indices->second + 1u;
-
-    // If the range of vector elements is smaller than the full load, attempt
-    // to create a smaller load.
-    if (NewNumElements < OldNumElements) {
-      IRBuilder Builder(&I);
-      Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-      // Calculate costs of old and new ops.
-      Type *ElemTy = OldLoadTy->getElementType();
-      FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
-      Value *PtrOp = OldLoad->getPointerOperand();
-
-      InstructionCost OldCost = TTI.getMemoryOpCost(
-          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
-          OldLoad->getPointerAddressSpace(), CostKind);
-      InstructionCost NewCost =
-          TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
-                              OldLoad->getPointerAddressSpace(), CostKind);
-
-      using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
-      SmallVector<UseEntry, 4u> NewUses;
-
-      for (llvm::Use &Use : I.uses()) {
-        auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-        ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
-        // Create entry for new use.
-        NewUses.push_back({Shuffle, OldMask});
-
-        // Update costs.
-        OldCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               OldLoadTy, OldMask, CostKind);
-        NewCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               NewLoadTy, OldMask, CostKind);
-      }
-
-      LLVM_DEBUG(
-          dbgs() << "Found a load used only by shufflevector instructions: "
-                 << I << "\n  OldCost: " << OldCost
-                 << " vs NewCost: " << NewCost << "\n");
-
-      if (OldCost < NewCost || !NewCost.isValid())
-        return false;
-
-      // Create new load of smaller vector.
-      auto *NewLoad = cast<LoadInst>(
-          Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
-      NewLoad->copyMetadata(I);
-
-      // Replace all uses.
-      for (UseEntry &Use : NewUses) {
-        ShuffleVectorInst *Shuffle = Use.first;
-        std::vector<int> &NewMask = Use.second;
-
-        Builder.SetInsertPoint(Shuffle);
-        Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
-        Value *NewShuffle = Builder.CreateShuffleVector(
-            NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
-        replaceValue(*Shuffle, *NewShuffle);
-      }
-
-      return true;
-    }
-  }
-  return false;
-}
-
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
         break;
-      case Instruction::Load:
-        MadeChange |= shrinkLoadForShuffles(I);
-        break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
         break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
 
 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
 ; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; SSE-NEXT:    ret <4 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; AVX-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
 define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
 
 define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
 ; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
 ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
 
 define  <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1)  {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT:    [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT:    ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT:    [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT:    ret <4 x double> [[BLEND]]
 ;
   %ld0 = load <4 x double>, ptr %p0, align 32
   %ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
   %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %s2
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]

llvmbot · 2025-08-04T13:29:42Z

@llvm/pr-subscribers-llvm-transforms

Author: Simon Pilgrim (RKSimon)

Changes

Reverts llvm/llvm-project#128938 while a crash regression is investigated

Patch is 38.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151960.diff

7 Files Affected:

(modified) clang/test/CodeGenOpenCL/preserve_vec3.cl (+11-11)
(modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (-128)
(modified) llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll (+4-4)
(modified) llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll (+4-2)
(modified) llvm/test/Transforms/VectorCombine/X86/load-widening.ll (+2-2)
(modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll (+15-9)
(removed) llvm/test/Transforms/VectorCombine/load-shufflevector.ll (-392)

diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: define dso_local spir_kernel void @foo(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
-#include <optional>
 #include <queue>
 #include <set>
-#include <tuple>
 
 #define DEBUG_TYPE "vector-combine"
 #include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
-  bool shrinkLoadForShuffles(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
-  auto *OldLoad = dyn_cast<LoadInst>(&I);
-  if (!OldLoad || !OldLoad->isSimple())
-    return false;
-
-  auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
-  if (!OldLoadTy)
-    return false;
-
-  unsigned const OldNumElements = OldLoadTy->getNumElements();
-
-  // Search all uses of load. If all uses are shufflevector instructions, and
-  // the second operands are all poison values, find the minimum and maximum
-  // indices of the vector elements referenced by all shuffle masks.
-  // Otherwise return `std::nullopt`.
-  using IndexRange = std::pair<int, int>;
-  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
-    IndexRange OutputRange = IndexRange(OldNumElements, -1);
-    for (llvm::Use &Use : I.uses()) {
-      // Ensure all uses match the required pattern.
-      User *Shuffle = Use.getUser();
-      ArrayRef<int> Mask;
-
-      if (!match(Shuffle,
-                 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
-        return std::nullopt;
-
-      // Ignore shufflevector instructions that have no uses.
-      if (Shuffle->use_empty())
-        continue;
-
-      // Find the min and max indices used by the shufflevector instruction.
-      for (int Index : Mask) {
-        if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
-          OutputRange.first = std::min(Index, OutputRange.first);
-          OutputRange.second = std::max(Index, OutputRange.second);
-        }
-      }
-    }
-
-    if (OutputRange.second < OutputRange.first)
-      return std::nullopt;
-
-    return OutputRange;
-  };
-
-  // Get the range of vector elements used by shufflevector instructions.
-  if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
-    unsigned const NewNumElements = Indices->second + 1u;
-
-    // If the range of vector elements is smaller than the full load, attempt
-    // to create a smaller load.
-    if (NewNumElements < OldNumElements) {
-      IRBuilder Builder(&I);
-      Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-      // Calculate costs of old and new ops.
-      Type *ElemTy = OldLoadTy->getElementType();
-      FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
-      Value *PtrOp = OldLoad->getPointerOperand();
-
-      InstructionCost OldCost = TTI.getMemoryOpCost(
-          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
-          OldLoad->getPointerAddressSpace(), CostKind);
-      InstructionCost NewCost =
-          TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
-                              OldLoad->getPointerAddressSpace(), CostKind);
-
-      using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
-      SmallVector<UseEntry, 4u> NewUses;
-
-      for (llvm::Use &Use : I.uses()) {
-        auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-        ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
-        // Create entry for new use.
-        NewUses.push_back({Shuffle, OldMask});
-
-        // Update costs.
-        OldCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               OldLoadTy, OldMask, CostKind);
-        NewCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               NewLoadTy, OldMask, CostKind);
-      }
-
-      LLVM_DEBUG(
-          dbgs() << "Found a load used only by shufflevector instructions: "
-                 << I << "\n  OldCost: " << OldCost
-                 << " vs NewCost: " << NewCost << "\n");
-
-      if (OldCost < NewCost || !NewCost.isValid())
-        return false;
-
-      // Create new load of smaller vector.
-      auto *NewLoad = cast<LoadInst>(
-          Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
-      NewLoad->copyMetadata(I);
-
-      // Replace all uses.
-      for (UseEntry &Use : NewUses) {
-        ShuffleVectorInst *Shuffle = Use.first;
-        std::vector<int> &NewMask = Use.second;
-
-        Builder.SetInsertPoint(Shuffle);
-        Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
-        Value *NewShuffle = Builder.CreateShuffleVector(
-            NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
-        replaceValue(*Shuffle, *NewShuffle);
-      }
-
-      return true;
-    }
-  }
-  return false;
-}
-
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
         break;
-      case Instruction::Load:
-        MadeChange |= shrinkLoadForShuffles(I);
-        break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
         break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
 
 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
 ; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; SSE-NEXT:    ret <4 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; AVX-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
 define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
 
 define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
 ; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
 ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
 
 define  <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1)  {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT:    [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT:    ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT:    [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT:    ret <4 x double> [[BLEND]]
 ;
   %ld0 = load <4 x double>, ptr %p0, align 32
   %ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
   %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %s2
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]

llvmbot · 2025-08-04T13:29:42Z

@llvm/pr-subscribers-vectorizers

Author: Simon Pilgrim (RKSimon)

Changes

Reverts llvm/llvm-project#128938 while a crash regression is investigated

Patch is 38.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151960.diff

7 Files Affected:

(modified) clang/test/CodeGenOpenCL/preserve_vec3.cl (+11-11)
(modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (-128)
(modified) llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll (+4-4)
(modified) llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll (+4-2)
(modified) llvm/test/Transforms/VectorCombine/X86/load-widening.ll (+2-2)
(modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll (+15-9)
(removed) llvm/test/Transforms/VectorCombine/load-shufflevector.ll (-392)

diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: define dso_local spir_kernel void @foo(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
 void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
 // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    ret void
 //
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fef0934010df4..6345b18b809a6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
-#include <optional>
 #include <queue>
 #include <set>
-#include <tuple>
 
 #define DEBUG_TYPE "vector-combine"
 #include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
-  bool shrinkLoadForShuffles(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3866,126 +3861,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
-  auto *OldLoad = dyn_cast<LoadInst>(&I);
-  if (!OldLoad || !OldLoad->isSimple())
-    return false;
-
-  auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
-  if (!OldLoadTy)
-    return false;
-
-  unsigned const OldNumElements = OldLoadTy->getNumElements();
-
-  // Search all uses of load. If all uses are shufflevector instructions, and
-  // the second operands are all poison values, find the minimum and maximum
-  // indices of the vector elements referenced by all shuffle masks.
-  // Otherwise return `std::nullopt`.
-  using IndexRange = std::pair<int, int>;
-  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
-    IndexRange OutputRange = IndexRange(OldNumElements, -1);
-    for (llvm::Use &Use : I.uses()) {
-      // Ensure all uses match the required pattern.
-      User *Shuffle = Use.getUser();
-      ArrayRef<int> Mask;
-
-      if (!match(Shuffle,
-                 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
-        return std::nullopt;
-
-      // Ignore shufflevector instructions that have no uses.
-      if (Shuffle->use_empty())
-        continue;
-
-      // Find the min and max indices used by the shufflevector instruction.
-      for (int Index : Mask) {
-        if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
-          OutputRange.first = std::min(Index, OutputRange.first);
-          OutputRange.second = std::max(Index, OutputRange.second);
-        }
-      }
-    }
-
-    if (OutputRange.second < OutputRange.first)
-      return std::nullopt;
-
-    return OutputRange;
-  };
-
-  // Get the range of vector elements used by shufflevector instructions.
-  if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
-    unsigned const NewNumElements = Indices->second + 1u;
-
-    // If the range of vector elements is smaller than the full load, attempt
-    // to create a smaller load.
-    if (NewNumElements < OldNumElements) {
-      IRBuilder Builder(&I);
-      Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-      // Calculate costs of old and new ops.
-      Type *ElemTy = OldLoadTy->getElementType();
-      FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
-      Value *PtrOp = OldLoad->getPointerOperand();
-
-      InstructionCost OldCost = TTI.getMemoryOpCost(
-          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
-          OldLoad->getPointerAddressSpace(), CostKind);
-      InstructionCost NewCost =
-          TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
-                              OldLoad->getPointerAddressSpace(), CostKind);
-
-      using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
-      SmallVector<UseEntry, 4u> NewUses;
-
-      for (llvm::Use &Use : I.uses()) {
-        auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-        ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
-        // Create entry for new use.
-        NewUses.push_back({Shuffle, OldMask});
-
-        // Update costs.
-        OldCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               OldLoadTy, OldMask, CostKind);
-        NewCost +=
-            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
-                               NewLoadTy, OldMask, CostKind);
-      }
-
-      LLVM_DEBUG(
-          dbgs() << "Found a load used only by shufflevector instructions: "
-                 << I << "\n  OldCost: " << OldCost
-                 << " vs NewCost: " << NewCost << "\n");
-
-      if (OldCost < NewCost || !NewCost.isValid())
-        return false;
-
-      // Create new load of smaller vector.
-      auto *NewLoad = cast<LoadInst>(
-          Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
-      NewLoad->copyMetadata(I);
-
-      // Replace all uses.
-      for (UseEntry &Use : NewUses) {
-        ShuffleVectorInst *Shuffle = Use.first;
-        std::vector<int> &NewMask = Use.second;
-
-        Builder.SetInsertPoint(Shuffle);
-        Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
-        Value *NewShuffle = Builder.CreateShuffleVector(
-            NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
-        replaceValue(*Shuffle, *NewShuffle);
-      }
-
-      return true;
-    }
-  }
-  return false;
-}
-
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -4062,9 +3937,6 @@ bool VectorCombine::run() {
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
         break;
-      case Instruction::Load:
-        MadeChange |= shrinkLoadForShuffles(I);
-        break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
         break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
 
 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
 ; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; SSE-NEXT:    ret <4 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; AVX-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
 define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
 
 define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
 ; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
 ; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
 
 define  <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1)  {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT:    [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT:    ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT:    [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT:    [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT:    ret <4 x double> [[BLEND]]
 ;
   %ld0 = load <4 x double>, ptr %p0, align 32
   %ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
   %s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %s2
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 467c20c5da0c2..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
-; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half>...
[truncated]

github-actions · 2025-08-04T13:31:38Z

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/VectorCombine.cpp llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll llvm/test/Transforms/VectorCombine/X86/load-widening.ll llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll

The following files introduce new uses of undef:

llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcas…

ed1f9d8

…ts (#128…" This reverts commit 1feed44.

llvmbot added clang Clang issues not falling into any other category vectorizers llvm:transforms llvm:vectorcombine labels Aug 4, 2025

RKSimon merged commit 88c6448 into main Aug 4, 2025
13 of 14 checks passed

RKSimon deleted the revert-128938-shrink_loads branch August 4, 2025 14:03

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960

Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960

Uh oh!

RKSimon commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

github-actions bot commented Aug 4, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960

Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts" #151960

Uh oh!

Conversation

RKSimon commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

llvmbot commented Aug 4, 2025

Uh oh!

github-actions bot commented Aug 4, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants